/* Copyright 2012 Tobias Marschall
 * 
 * This file is part of CLEVER.
 * 
 * CLEVER is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CLEVER is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CLEVER.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <algorithm>
#include <cassert>
#include <cmath>
#include <ctime>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <boost/program_options.hpp>
#include <boost/unordered_set.hpp>
#include <boost/unordered_map.hpp>
#include <boost/tokenizer.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/dynamic_bitset.hpp>

#include <bamtools/api/BamReader.h>

#include "Variation.h"
#include "VariationUtils.h"
#include "VariationListParser.h"
#include "LengthAwareVariationIndex.h"
#include "FastaReader.h"
#include "Genotyper.h"
#include "GenotypeDistribution.h"
#include "BamHelper.h"
#include "VersionInfo.h"

using namespace std;
using namespace boost;
namespace po = boost::program_options;

/**
 * Prints the command-line help text (including the option descriptions)
 * to stderr and terminates the program with exit status 1. Never returns.
 */
void usage(const char* name, const po::options_description& options_desc) {
	cerr << "Usage: " << name << " [options] <dataset-list-file> <ref.fasta(.gz)>" << endl
	     << endl
	     << "Reads predictions by CLEVER and from a split-read mapper (such as laser-core) for" << endl
	     << "a list of data sets and outputs a merged set of deletion calls to stdout (VCF format)." << endl
	     << endl
	     << "<dataset-list-file> has to contain one row per data set with three columns:" << endl
	     << "<name> <clever-predictions-file> <splitread-predictions-file> [<splitread-bam-file>]" << endl
	     << endl
	     << "If <splitread-bam-file> is given, additional statistics are written." << endl
	     << endl
	     << options_desc << endl;
	exit(1);
}

// One input data set (sample): its name, the CLEVER and split-read prediction
// files, and an optional BAM file with alignments.
// NOTE(review): the pointer members are heap-allocated later (in main) and
// never freed; they intentionally live until process exit.
typedef struct dataset_t {
	string name;
	string clever_predictions_filename;
	string split_predictions_filename;
	string bam_filename; // empty when no BAM file was given
	BamTools::BamReader* bam_reader; // 0 when no BAM file was given
	vector<Variation>* clever_predictions; // CLEVER calls for this sample
	LengthAwareVariationIndex* clever_index; // index over clever_predictions
	vector<double>* clever_support; // per-prediction support, parallel to clever_predictions
	boost::dynamic_bitset<>* clever_used; // marks predictions already merged into an output call
	Genotyper::read_name_set_t used_reads; // read names already consumed by the genotyper
	// Constructor for data sets without a BAM file.
	dataset_t(string name, string clever_predictions_filename, string split_predictions_filename) : name(name), clever_predictions_filename(clever_predictions_filename), split_predictions_filename(split_predictions_filename), bam_filename(""), bam_reader(0), clever_predictions(0), clever_index(0), clever_support(0), clever_used(0) {}
	// Constructor for data sets with a BAM file: verifies the ".bam" suffix,
	// opens the file and its index. Throws std::runtime_error on any failure.
	dataset_t(string name, string clever_predictions_filename, string split_predictions_filename, string bam_filename) : name(name), clever_predictions_filename(clever_predictions_filename), split_predictions_filename(split_predictions_filename), bam_filename(bam_filename), bam_reader(new BamTools::BamReader()), clever_predictions(0), clever_index(0), clever_support(0), clever_used(0) {
		if ((bam_filename.size() < 4) || (bam_filename.substr(bam_filename.size()-4,4).compare(".bam") != 0)) {
			ostringstream oss;
			oss << "Filename doesn't end on \".bam\": \"" << bam_filename << "\".";
			throw std::runtime_error(oss.str());
		}
		if (!bam_reader->Open(bam_filename)) {
			ostringstream oss;
			oss << "Could not read BAM input from \"" << bam_filename << "\".";
			throw std::runtime_error(oss.str());
		}
		// Index lookup: try "<file>.bam.bai" first, then fall back to "<file>.bai".
		if (!bam_reader->OpenIndex(bam_filename + ".bai")) {
			string index_filename(bam_filename);
			index_filename.replace(bam_filename.size()-3,3,"bai"); // "x.bam" -> "x.bai"
			if (!bam_reader->OpenIndex(index_filename)) {
				ostringstream oss;
				oss << "Could not locate index for BAM file \"" << bam_filename << "\".";
				throw std::runtime_error(oss.str());
			}
		}
	}
} dataset_t;

// Split-read support counters for one variation, aggregated over all data sets.
typedef struct variation_stats_t {
	int support_sum; // total support summed over all data sets
	// for each data set the support for that variation
	std::vector<int> split_support;
	variation_stats_t(size_t dataset_count) : support_sum(0) {
		split_support.assign(dataset_count, 0);
	}
} variation_stats_t;

typedef struct variation_and_stats_t {
	Variation variation;
	variation_stats_t* stats;
	variation_and_stats_t(Variation variation, variation_stats_t* stats) : variation(variation), stats(stats) {}
} variation_and_stats_t;

typedef struct {
	bool operator()(const variation_and_stats_t& v1, const variation_and_stats_t& v2) {
		assert(v1.stats != 0);
		assert(v2.stats != 0);
		return v1.stats->support_sum > v2.stats->support_sum;
	}
} variation_and_stats_comparator_t;

// information that is output per combination of variation and data set
typedef struct output_stats_t {
	string genotype;
	Genotyper::variation_stats_t genotyper_stats;
	int approximate_split_support;
	int clique_size;
	GenotypeDistribution genotype_distribution;
	output_stats_t() : genotype("."), approximate_split_support(0), clique_size(0) {}
} output_stats_t;

// Writes a one-line, space-separated description of a data set; the BAM
// column reads "BAM_unknown" when no BAM file was configured.
ostream& operator<<(ostream& os, const dataset_t& d) {
	os << d.name << " " << d.clever_predictions_filename << " " << d.split_predictions_filename << " ";
	if (d.bam_filename.size() == 0) {
		os << "BAM_unknown";
	} else {
		os << d.bam_filename;
	}
	return os;
}

// Maps trio genotype codes to VCF GT strings; indexed with
// trio_genotype->mother/father/child in main (presumably 0=absent,
// 1=heterozygous, 2=homozygous — confirm against Genotyper::genotypeTrio).
string genotype2str[3] = {".", "1/.", "1/1"};

/**
 * Reads the dataset list file into *result. Each non-empty line must contain
 * three or four whitespace-separated columns:
 *   <name> <clever-predictions-file> <splitread-predictions-file> [<bam-file>]
 * Blank lines are skipped. Exits with status 1 if the file cannot be opened;
 * throws std::runtime_error on a malformed line.
 */
void read_dataset_list(const string& filename, vector<dataset_t>* result) {
	assert(result != 0);
	ifstream is(filename.c_str());
	if (is.fail()) {
		cerr << "Error: could not open \"" << filename << "\"." << endl;
		exit(1);
	}
	typedef boost::tokenizer<boost::char_separator<char> > tokenizer_t;
	boost::char_separator<char> whitespace_separator(" \t");
	string line;
	int linenr = 0;
	while (getline(is,line)) {
		linenr += 1;
		tokenizer_t tokenizer(line, whitespace_separator);
		vector<string> tokens(tokenizer.begin(), tokenizer.end());
		if (tokens.size() == 0) {
			// tolerate blank lines (e.g. a trailing newline at end of file)
			continue;
		}
		if (tokens.size() == 3) {
			result->push_back(dataset_t(tokens[0], tokens[1], tokens[2]));
		} else if (tokens.size() == 4) {
			// four columns: a BAM file is given (opens BAM + index, may throw)
			result->push_back(dataset_t(tokens[0], tokens[1], tokens[2], tokens[3]));
		} else {
			ostringstream oss;
			oss << "Error parsing data set list. Offending line: " << linenr;
			throw std::runtime_error(oss.str());
		}
		int i = result->size() - 1;
		cerr << "Data set " << i << ": " << result->at(i) << endl;
	}
}

/**
 * Finds all deletions in "list" that are similar to v: their center points may
 * differ by at most max_distance and their lengths by at most max_length_diff.
 * Candidate lookup is narrowed via the given index; returns the indices (into
 * "list") of all matches. Never returns a null pointer.
 */
auto_ptr<vector<size_t> > find_similar(const Variation& v, const vector<Variation>& list, LengthAwareVariationIndex& index, int max_distance, int max_length_diff) {
	auto_ptr<vector<size_t> > results(new vector<size_t>());
	assert(v.getType() == Variation::DELETION);
	int length1 = v.getCoordinate2() - v.getCoordinate1();
	double center1 = ((double)(v.getCoordinate1() + v.getCoordinate2())) / 2.0;
	// window chosen large enough to contain every candidate that can pass the
	// center-distance and length-difference filters below
	int windowsize = max_distance + (length1 + max_length_diff) / 2 + 1;
	auto_ptr<vector<size_t> > candidates = index.containedIn(v.getChromosome(), v.getCoordinate1()-windowsize, v.getCoordinate2()+windowsize, max(length1-max_length_diff, 1), length1+max_length_diff);
	if (candidates.get() == 0) return results;
	for (size_t i=0; i<candidates->size(); ++i) {
		assert(candidates->at(i) < list.size());
		const Variation& v2 = list[candidates->at(i)];
		if (v2.getType() != Variation::DELETION) continue;
		int length2 = v2.getCoordinate2() - v2.getCoordinate1();
		if (abs(length1-length2) > max_length_diff) continue;
		// BUGFIX: center2 was declared int, silently truncating the .5 of
		// odd-sum midpoints; that made the distance test asymmetric compared
		// to center1 (a double). Keep both centers in floating point.
		double center2 = ((double)(v2.getCoordinate1() + v2.getCoordinate2())) / 2.0;
		if (fabs(center1-center2) > max_distance) continue;
		results->push_back(candidates->at(i));
	}
	return results;
}

/**
 * Returns the current local date formatted as "YYYYMMDD", e.g. "20120131"
 * (used for the VCF ##fileDate header line).
 */
std::string get_date_string() {
	std::time_t t = std::time(0);
	struct std::tm* now = std::localtime(&t);
	char buffer[16];
	std::strftime(buffer, sizeof(buffer), "%Y%m%d", now);
	return std::string(buffer);
}

// Program entry point. Pipeline:
//  (1) parse options and the two trailing positional arguments;
//  (2) per data set: load CLEVER predictions and collect split-read deletion
//      calls (from BAM CIGARs with -b, else from a predictions file) into a
//      global map with per-dataset support counts;
//  (3) sort all split-read deletions by total support and greedily merge
//      similar calls (split-vs-split and split-vs-CLEVER);
//  (4) optionally genotype each merged call per data set (trio-aware on
//      request);
//  (5) print surviving deletions as VCF 4.1 records to stdout.
int main(int argc, char* argv[]) {
	VersionInfo::checkAndPrintVersion("merge-to-vcf", cerr);
	string commandline = VersionInfo::commandline(argc, argv);

	// PARAMETERS
	int max_distance_split_clever;
	int max_length_diff_split_clever;
	int max_distance_split_split;
	int max_length_diff_split_split;
	int min_length;
	double internal_segment_length_mean;
	double internal_segment_length_stddev;
	int bam_window_size;
	bool genotyping = false;
	bool trio_aware = false;
	bool split_reads_from_bam = false;
	int min_mapq;
	double denovo_threshold;
	double variant_prior;
	bool genotyping_only_inserts = false;
	bool genotyping_only_splits = false;
	
	po::options_description options_desc("Allowed options");
	options_desc.add_options()
		("max_offset,o", po::value<int>(&max_distance_split_clever)->default_value(100), "Maximum allowed distance between split read and CLEVER call.")
		("max_length_diff,z", po::value<int>(&max_length_diff_split_clever)->default_value(20), "Maximum allowed length difference between split read and CLEVER call.")
		("max_offset_split,O", po::value<int>(&max_distance_split_split)->default_value(20), "Maximum allowed distance between two split read calls.")
		("max_length_diff_split,Z", po::value<int>(&max_length_diff_split_split)->default_value(5), "Maximum allowed length difference between two split read calls.")
		("min_length,l", po::value<int>(&min_length)->default_value(10), "Minimum length of variations to be written.")
		("isize_mean", po::value<double>(&internal_segment_length_mean)->default_value(-1.0), "Mean length of internal segments. If given and BAM files are present, number of supporting internal segment size reads is determined.")
		("isize_stddev", po::value<double>(&internal_segment_length_stddev)->default_value(-1.0), "Standard deviation of internal segments.")
		("bam_window,w", po::value<int>(&bam_window_size)->default_value(1000), "Number of nucleotides to look to the right/to the left of deletions in BAM files to find relevant alignments.")
		("genotype,G", po::value<bool>(&genotyping)->zero_tokens(), "Perform genotyping.")
		("trio_aware,T", po::value<bool>(&trio_aware)->zero_tokens(), "Perform trio-aware genotyping.")
		("denovo_threshold,d", po::value<double>(&denovo_threshold)->default_value(1e-5), "Threshold for de novo calls (in trio mode)")
		("split_reads_from_bam,b", po::value<bool>(&split_reads_from_bam)->zero_tokens(), "Read split read evidence from BAM file.")
		("mapq,m", po::value<int>(&min_mapq)->default_value(0), "Minimum MAPQ. Alignments with lower MAPQ are ignored.")
		("variant_prior,p", po::value<double>(&variant_prior)->default_value(0.5), "Prior believe in a variant (given that the locus is under investigation).")
		("gt_only_insert,I", po::value<bool>(&genotyping_only_inserts)->zero_tokens(), "Do genotyping only based on internal-segment-type alignments.")
		("gt_only_split,S", po::value<bool>(&genotyping_only_splits)->zero_tokens(), "Do genotyping only based on split-type alignments.")
	;

	if (argc<3) {
		usage(argv[0], options_desc);
	}
	// The two trailing positional arguments are consumed here; argc is
	// reduced so that boost::program_options only sees the option arguments.
	string dataset_list_filename(argv[argc-2]);
	string reference_filename(argv[argc-1]);
	argc -= 2;

	po::variables_map options;
	try {
		po::store(po::parse_command_line(argc, argv, options_desc), options);
		po::notify(options);
	} catch(std::exception& e) {
		cerr << "error: " << e.what() << "\n";
		return 1;
	}

	// Trio mode implies genotyping; genotyping requires the insert size
	// distribution parameters to be given.
	if (trio_aware) genotyping = true;
	if (genotyping && ((internal_segment_length_mean < 0.0) || (internal_segment_length_stddev < 0.0))) {
		cerr << "Error: to perform genotyping, mean and stddev of internal segment length distribution must be provided." << endl;
		return 1;
	}
	cerr << "Commandline: " << commandline << endl;

	cerr << "Warning: only outputs deletions. Insertions not implemented yet." << endl;
	
	// Maps each split-read deletion to its support counts over all data sets.
	// NOTE(review): the variation_stats_t values are heap-allocated and never
	// freed; they live until process exit.
	typedef boost::unordered_map<Variation, variation_stats_t*> split_variation_map_t;
	split_variation_map_t split_variation_map;
	vector<dataset_t> datasets;
	read_dataset_list(dataset_list_filename, &datasets);
	typedef boost::unordered_map<string,int> dataset_name_map_t;
	dataset_name_map_t dataset_name_map;

	// Load predictions for every data set and aggregate split-read support.
	bool bam_present = false;
	for (size_t i=0; i<datasets.size(); ++i) {
		dataset_t& d = datasets[i];
		dataset_name_map_t::const_iterator name_it = dataset_name_map.find(d.name);
		if (name_it != dataset_name_map.end()) {
			cerr << "Error: duplicate dataset name: \"" << d.name << "\"." << endl;
			return 1;
		}
		dataset_name_map[d.name] = i;
		if (d.bam_reader != 0) bam_present = true;
		// read clever predictions
		ifstream clever_predictions_is(d.clever_predictions_filename.c_str());
		if (clever_predictions_is.fail()) {
			cerr << "Error: could not open \"" << d.clever_predictions_filename << "\"." << endl;
			return 1;
		}
		d.clever_support = new vector<double>();
		auto_ptr<vector<Variation> > variations = VariationListParser::parse(clever_predictions_is, false, min_length, d.clever_support);
		d.clever_predictions = new vector<Variation>(variations->begin(), variations->end());
		d.clever_index = new LengthAwareVariationIndex(*d.clever_predictions);
		d.clever_used = new boost::dynamic_bitset<>(d.clever_predictions->size());
		cerr << "Read " << d.clever_predictions->size() << " CLEVER predictions from " << d.clever_predictions_filename << endl;
		// read split read predictions
		if (split_reads_from_bam) {
			// Option -b: extract deletion evidence directly from the BAM
			// alignments (each alignment counts 1 toward the support).
			if (d.bam_reader == 0) {
				cerr << "No BAM file present for dataset \"" << d.name << "\" although option -b was given." << endl;
				return 1;
			}
			cerr << "Reading deletions from BAM file for dataset \"" << d.name << "\"." << endl;
			BamTools::BamAlignment aln;
			while (d.bam_reader->GetNextAlignment(aln)) {
				if (!aln.IsMapped()) continue;
				if ((min_mapq > 0) && (aln.MapQuality < min_mapq)) continue;
				auto_ptr<vector<Variation> > aln_vars = BamHelper::variationsFromAlignment(d.bam_reader->GetReferenceData(), aln);
				vector<Variation>::const_iterator it = aln_vars->begin();
				for (; it != aln_vars->end(); ++it) {
					if (it->getType() != Variation::DELETION) continue;
					split_variation_map_t::const_iterator map_it = split_variation_map.find(*it);
					if (map_it == split_variation_map.end()) {
						variation_stats_t* stats = new variation_stats_t(datasets.size()); 
						stats->split_support[i] += 1;
						stats->support_sum += 1;
						split_variation_map[*it] = stats;
					} else {
						map_it->second->split_support[i] += 1;
						map_it->second->support_sum += 1;
					}
				}
			}
		} else {
			// Default: read split-read calls (with support values) from the
			// predictions file given in the dataset list.
			ifstream split_predictions_is(d.split_predictions_filename.c_str());
			if (split_predictions_is.fail()) {
				cerr << "Error: could not open \"" << d.split_predictions_filename << "\"." << endl;
				return 1;
			}
			vector<double> split_support;
			variations = VariationListParser::parse(split_predictions_is, false, min_length, &split_support, Variation::DELETION);
			cerr << "Read " << variations->size() << " split-read predictions from " << d.split_predictions_filename << endl;
			assert(split_support.size() == variations->size());
			for (size_t j=0; j<variations->size(); ++j) {
				const Variation& v = variations->at(j);
				split_variation_map_t::const_iterator it = split_variation_map.find(v);
				int support = (int)roundf(split_support[j]);
				if (it == split_variation_map.end()) {
					variation_stats_t* stats = new variation_stats_t(datasets.size()); 
					stats->split_support[i] += support;
					stats->support_sum += support;
					split_variation_map[v] = stats;
				} else {
					it->second->split_support[i] += support;
					it->second->support_sum += support;
				}
			}
		}
	}
	// In trio mode, exactly three datasets named "mother"/"father"/"child"
	// are required; remember their indices for the trio re-genotyping step.
	int mother_idx = -1;
	int father_idx = -1;
	int child_idx = -1;
	if (trio_aware) {
		if (datasets.size() != 3) {
			cerr << "Error: expecting exactly 3 datasets in trio mode." << endl;
			return 1;
		}
		dataset_name_map_t::const_iterator e = dataset_name_map.end();
		if ((dataset_name_map.find("mother") == e) || (dataset_name_map.find("father") == e) || (dataset_name_map.find("child") == e)) {
			cerr << "Error: expecting datasets to be named \"mother\", \"father\", \"child\" in trio mode." << endl;
			return 1;
		}
		mother_idx = dataset_name_map["mother"];
		father_idx = dataset_name_map["father"];
		child_idx = dataset_name_map["child"];
	}
	// create vector with all variations supported by split reads
	vector<variation_and_stats_t> all_splitread_variation_and_stats;
	for (split_variation_map_t::const_iterator it = split_variation_map.begin(); it != split_variation_map.end(); ++it) {
		if (it->first.getType() != Variation::DELETION) continue;
		all_splitread_variation_and_stats.push_back(variation_and_stats_t(it->first, it->second));
	}
	cerr << "Total split read deletions (over all data sets): " << all_splitread_variation_and_stats.size() << endl;
	// sort variations by sum of split-read support
	sort(all_splitread_variation_and_stats.begin(), all_splitread_variation_and_stats.end(), variation_and_stats_comparator_t());
//	for (size_t i=0; i<all_splitread_variation_and_stats.size(); ++i) {
//		cerr << all_splitread_variation_and_stats[i].stats->support_sum << " " << all_splitread_variation_and_stats[i].variation << endl;
//	}
	// build split-read variation vector in sorted order
	vector<Variation> all_splitread_variation;
	for (size_t i=0; i<all_splitread_variation_and_stats.size(); ++i) {
		all_splitread_variation.push_back(all_splitread_variation_and_stats[i].variation);
	}
	// build variation index
	LengthAwareVariationIndex all_splitread_variation_index(all_splitread_variation);
	// bitarray keeping track of which split-read variations have already been used
	boost::dynamic_bitset<> splitreads_used(all_splitread_variation.size());
	// load reference genome
	auto_ptr<FastaReader::reference_map_t> reference_map = FastaReader::parseFromFile(reference_filename);
	cerr << "Read " << reference_map->size() << " reference sequences." << endl;
	// output header
	cout << "##fileformat=VCFv4.1" << endl;
	cout << "##fileDate=" << get_date_string() << endl;
	cout << "##source=clever-merge-to-vcf-" << VersionInfo::version() << " cmdline: " << commandline << endl;
	cout << "##reference=" << reference_filename << endl;
	cout << "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">" << endl;
	cout << "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">" << endl;
	cout << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl;
	cout << "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Allele depth (internal segment), how many reads support this allele via their internal segment\">" << endl;
	cout << "##FORMAT=<ID=SD,Number=1,Type=Integer,Description=\"Allele depth (split read), how many reads support this allele via a split alignment\">" << endl;
	cout << "##FORMAT=<ID=AS,Number=1,Type=Integer,Description=\"Approximate split alignments, how many reads almost support this event via a split alignment (additional to those in SD)\">" << endl;
	cout << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";
	for (size_t dataset_idx=0; dataset_idx<datasets.size(); ++dataset_idx) {
		cout << '\t' << datasets[dataset_idx].name;
	}
	cout << endl;
	// Per-sample FORMAT columns: RC/IC only when BAM files are available,
	// IS additionally requires the insert size distribution (--isize_mean).
	string format_string = "GT:AD:SD:AS";
	bool output_internal_segment_support = bam_present && (internal_segment_length_mean > 0);
	if (bam_present) {
		format_string += ":RC:IC";
	}
	if (output_internal_segment_support) {
		format_string += ":IS";
	}
	Genotyper* genotyper = 0;
	if (genotyping) {
		genotyper = new Genotyper(internal_segment_length_mean, internal_segment_length_stddev, variant_prior, max_length_diff_split_split, max_distance_split_split, min_mapq, bam_window_size, !genotyping_only_splits, !genotyping_only_inserts);
		genotyper->enableMasking(500, 10000);
	}
	OverlappingRegions masked_regions;
	// iterate over all split-read variations ordered by total support (sum over all datasets)
	for (size_t i=0; i<all_splitread_variation.size(); ++i) {
		const Variation& v = all_splitread_variation[i];
		// has variation been used before (i.e., similar to another one)?
		if (splitreads_used[i]) {
			// cout << "Skipping " << v << endl;
			continue;
		}
		// create vector with records to be output per dataset
		vector<output_stats_t> output_stats(datasets.size(), output_stats_t());
		// copy split-read support counts to output records
		for (size_t dataset_idx=0; dataset_idx<datasets.size(); ++dataset_idx) {
			output_stats[dataset_idx].genotyper_stats.split_evidence.alt_support = all_splitread_variation_and_stats[i].stats->split_support[dataset_idx];
		}
		// find similar split-read variations; their support is folded into
		// this (more strongly supported) call and they are marked as used
		auto_ptr<vector<size_t> > similar_split = find_similar(v, all_splitread_variation, all_splitread_variation_index, max_distance_split_split, max_length_diff_split_split);
		for (size_t j=0; j<similar_split->size(); ++j) {
			size_t k = similar_split->at(j);
			if (k == i) continue;
			if (splitreads_used[k]) continue;
			const variation_stats_t* split_stats = all_splitread_variation_and_stats[k].stats;
			assert(split_stats != 0);
			for (size_t dataset_idx=0; dataset_idx<datasets.size(); ++dataset_idx) {
				output_stats[dataset_idx].approximate_split_support += split_stats->split_support[dataset_idx];
			}
			splitreads_used.set(k, true);
		}
		// for each dataset, find similar CLEVER calls
		for (size_t dataset_idx=0; dataset_idx<datasets.size(); ++dataset_idx) {
			dataset_t& d = datasets[dataset_idx];
			auto_ptr<vector<size_t> > similar_clever = find_similar(v, *d.clever_predictions, *d.clever_index, max_distance_split_clever, max_length_diff_split_clever);
			for (size_t j=0; j<similar_clever->size(); ++j) {
				size_t k = similar_clever->at(j);
				if ((*d.clever_used)[k]) continue;
				int support = (int)roundf(d.clever_support->at(k));
				output_stats[dataset_idx].clique_size += support;
				d.clever_used->set(k, true);
			}
		}
		int length = v.getCoordinate2()-v.getCoordinate1();
		int center = (v.getCoordinate1() + v.getCoordinate2()) / 2;
		// TODO
// 		if (forbidden.find(center) != forbidden.end()) {
// 			cerr << "Skipping forbidden position " << center << endl;
// 			continue;
// 		}
		// if bam files are present determine coverage etc.
		bool skip = false;
		for (size_t dataset_idx=0; dataset_idx<datasets.size(); ++dataset_idx) {
			dataset_t& d = datasets[dataset_idx];
			if (d.bam_reader != 0) {
				int ref_id = d.bam_reader->GetReferenceID(v.getChromosome());
				if (ref_id < 0) {
					cerr << "Error: reference " << v.getChromosome() << " not found in BAM file." << endl;
					return 1;
				}
				assert(genotyper != 0);
				auto_ptr<Genotyper::variation_stats_t> genotyper_stats =  genotyper->extractVariationStats(v, *(d.bam_reader),  &masked_regions, &(d.used_reads));
				if (genotyper_stats.get() == 0) {
					skip = true;
					break;
				}
				output_stats[dataset_idx].genotyper_stats = *genotyper_stats;
			}
		}
		if (skip) {
			cerr << "Warning: skipping variation (either coverage too high or reference unknown)." << endl;
			continue;
		}
		// do genotyping
		if (genotyping) {
			for (size_t dataset_idx=0; dataset_idx<datasets.size(); ++dataset_idx) {
				output_stats_t& o = output_stats[dataset_idx];
				cerr << "Genotyping deletion at " << v.getChromosome() << ' ' <<  v.getCoordinate1() - 1 << endl;
				cerr << "  insert size (REF/ALT):" << o.genotyper_stats.insert_evidence.ref_support << "/" << o.genotyper_stats.insert_evidence.alt_support << endl;
				cerr << "  split reads (REF/ALT):" << o.genotyper_stats.split_evidence.ref_support << "/" << o.genotyper_stats.split_evidence.alt_support << endl;

				auto_ptr<GenotypeDistribution> genotype_distribution = genotyper->computeGenotype(v, o.genotyper_stats);
				assert(genotype_distribution.get() != 0);
				o.genotype_distribution = *genotype_distribution;
				o.genotype = genotype_distribution->likeliestGenotypeString();
			}
			// if trio-aware, then re-genotype all three samples
			if (trio_aware) {
				auto_ptr<Genotyper::trio_genotype_t> trio_genotype = Genotyper::genotypeTrio(
					output_stats[mother_idx].genotype_distribution,
					output_stats[father_idx].genotype_distribution,
					output_stats[child_idx].genotype_distribution,
					denovo_threshold
				);
				output_stats[mother_idx].genotype = genotype2str[trio_genotype->mother];
				output_stats[father_idx].genotype = genotype2str[trio_genotype->father];
				output_stats[child_idx].genotype = genotype2str[trio_genotype->child];
			}
		} else {
			// Without genotyping, a heuristic: call the variant present
			// ("1/.") when CLEVER support is at least 5 and there is at least
			// one supporting split read.
			for (size_t dataset_idx=0; dataset_idx<datasets.size(); ++dataset_idx) {
				output_stats_t& o = output_stats[dataset_idx];
				if ((o.clique_size >= 5) && (o.genotyper_stats.split_evidence.alt_support >= 1)) {
					o.genotype = "1/.";
				}
			}
		}
		// determine whether any individual has this deletion
		bool print_deletion = false;
		for (size_t dataset_idx=0; dataset_idx<datasets.size(); ++dataset_idx) {
			if ((output_stats[dataset_idx].genotype.compare("1/.") == 0) ||
				(output_stats[dataset_idx].genotype.compare("1/1") == 0) || 
				(output_stats[dataset_idx].genotype.compare("1/0") == 0)
			) {
				print_deletion = true;
			}
		}
		if (print_deletion) {
			FastaReader::reference_map_t::const_iterator ref = reference_map->find(v.getChromosome());
			if (ref == reference_map->end()) {
				cerr << "Unknown reference sequence: " << v.getChromosome() << endl;
				return 1;
			}
			// Emit one VCF record; REF spans the base before the deletion
			// plus the deleted sequence, ALT is that single preceding base
			// (assumes getCoordinate1() is 1-based — TODO confirm).
			cout << v.getChromosome(); // CHROM
			cout << '\t' << v.getCoordinate1(); // POS
			cout << '\t' << "."; // ID
			cout << '\t' << ref->second->substr(v.getCoordinate1()-1, length + 1); // REF
			cout << '\t' << (*ref->second)[v.getCoordinate1()-1]; // ALT
			cout << '\t' << "."; // QUAL
			cout << '\t' << "PASS"; // FILTER
			cout << "\tSVTYPE=DEL;SVLEN=" << -length; // INFO
			cout << '\t' << format_string; // FORMAT
			for (size_t dataset_idx=0; dataset_idx<datasets.size(); ++dataset_idx) {
				output_stats_t& o = output_stats[dataset_idx];
				cout << '\t' << (o.genotype) << ":" << o.clique_size << ":" << o.genotyper_stats.split_evidence.alt_support << ":" << o.approximate_split_support;
				if (bam_present) {
					cout << ":" << o.genotyper_stats.split_evidence.coverage() << ":" << o.genotyper_stats.insert_evidence.coverage();
					if (output_internal_segment_support) {
						cout << ":" << o.genotyper_stats.insert_evidence.alt_support;
					}
				}
			}
			cout << endl; 
		} else {
			// cout << "Not printing " << v << endl;
		}
	}
	return 0;
}
